Clustering

library(tidyverse)
library(scatterplot3d)
library(plotly)

Load data

# Build `raw_df_cl`: one summary row per (stock file, sampled time_id).
#
# For every CSV in `dir.short`, a random subset of its time_ids is kept, a set
# of per-row order-book features is derived, and each feature is averaged
# within the time_id. The per-file summaries are collected in a list and bound
# together once at the end (binding inside the loop copies the accumulated
# frame on every iteration).
dir.short <- "data/individual_book_train/"
all.files.short <- list.files(dir.short)

# Fix the RNG state so the sampled time_ids -- and everything derived from
# them further down -- are the same on every run.
set.seed(1)

# One slot per input file, filled in by the loop below.
stock_summaries <- setNames(
  vector("list", length(all.files.short)),
  all.files.short
)

for (i in all.files.short) {
  stock <- read.csv(file.path(dir.short, i))

  # Randomly select up to 10 time_ids from this stock. Indexing through
  # sample.int() avoids sample()'s special case where a single numeric value x
  # is treated as 1:x, and min() avoids an error when fewer than 10 ids exist.
  unique_ids <- unique(stock$time_id)
  time_ids <- unique_ids[
    sample.int(length(unique_ids), min(10L, length(unique_ids)))
  ]

  # "stock_12.csv" -> "12". The patterns are anchored and the dot is escaped
  # so only the literal prefix and extension are stripped.
  file_name <- sub("^stock_", "", sub("\\.csv$", "", i))

  stock <- stock |>
    filter(time_id %in% time_ids) |>
    mutate(
      # Level-1 prices, each weighted by the size on the opposite side.
      WAP = (bid_price1 * ask_size1 + ask_price1 * bid_size1) /
        (bid_size1 + ask_size1),
      # Relative gap between best ask and best bid.
      BidAskSpread = ask_price1 / bid_price1 - 1,
      # Signed share of level-1 size on the bid side vs. the ask side.
      imbalance = (bid_size1 - ask_size1) / (bid_size1 + ask_size1),
      volume = ask_size1 + bid_size1,
      range = ask_price1 - bid_price1,
      # Total size over the first two levels of both sides.
      num_order = bid_size1 + ask_size1 + bid_size2 + ask_size2,
      # Ad-hoc ("made up") feature: bid notional over ask notional at level 1.
      rush = (bid_size1 * bid_price1) / (ask_size1 * ask_price1)
    ) |>
    group_by(time_id) |>
    summarise(
      mean_BAS = mean(BidAskSpread),
      mean_WAP = mean(WAP),
      imbalance = mean(imbalance),
      volume = mean(volume),
      range = mean(range),
      num_order = mean(num_order),
      rush = mean(rush)
    ) |>
    mutate(file_name = file_name)

  stock_summaries[[i]] <- stock

  # Progress indicator: one line per processed file.
  print(file_name)

}

# Single bind at the end; columns are time_id, the seven features, file_name.
raw_df_cl <- bind_rows(stock_summaries)
[1] "0"
[1] "1"
[1] "10"
[1] "100"
[1] "101"
[1] "102"
[1] "103"
[1] "104"
[1] "105"
[1] "107"
[1] "108"
[1] "109"
[1] "11"
[1] "110"
[1] "111"
[1] "112"
[1] "113"
[1] "114"
[1] "115"
[1] "116"
[1] "118"
[1] "119"
[1] "120"
[1] "122"
[1] "123"
[1] "124"
[1] "125"
[1] "126"
[1] "13"
[1] "14"
[1] "15"
[1] "16"
[1] "17"
[1] "18"
[1] "19"
[1] "2"
[1] "20"
[1] "21"
[1] "22"
[1] "23"
[1] "26"
[1] "27"
[1] "28"
[1] "29"
[1] "3"
[1] "30"
[1] "31"
[1] "32"
[1] "33"
[1] "34"
[1] "35"
[1] "36"
[1] "37"
[1] "38"
[1] "39"
[1] "4"
[1] "40"
[1] "41"
[1] "42"
[1] "43"
[1] "44"
[1] "46"
[1] "47"
[1] "48"
[1] "5"
[1] "50"
[1] "51"
[1] "52"
[1] "53"
[1] "55"
[1] "56"
[1] "58"
[1] "59"
[1] "6"
[1] "60"
[1] "61"
[1] "62"
[1] "63"
[1] "64"
[1] "66"
[1] "67"
[1] "68"
[1] "69"
[1] "7"
[1] "70"
[1] "72"
[1] "73"
[1] "74"
[1] "75"
[1] "76"
[1] "77"
[1] "78"
[1] "8"
[1] "80"
[1] "81"
[1] "82"
[1] "83"
[1] "84"
[1] "85"
[1] "86"
[1] "87"
[1] "88"
[1] "89"
[1] "9"
[1] "90"
[1] "93"
[1] "94"
[1] "95"
[1] "96"
[1] "97"
[1] "98"
[1] "99"

Apply clustering

library(caret)

# Derive `df_cl` from `raw_df_cl`:
#   1. min-max rescale every feature column: (x - min) / (max - min);
#   2. convert file_name to numeric and sort rows by it;
#   3. build a "<file_name> <time_id>" label in `name`;
#   4. drop the rows of file 31 and the two id columns.
# Rescaling happens before the filter, so file 31 still contributes to the
# min/max used for scaling.
df_cl <- raw_df_cl |>
  mutate(across(
    all_of(c(
      "mean_BAS", "mean_WAP", "imbalance", "volume",
      "range", "num_order", "rush"
    )),
    function(v) (v - min(v)) / (max(v) - min(v))
  )) |>
  mutate(file_name = as.numeric(file_name)) |>
  arrange(file_name) |>
  mutate(name = paste(file_name, time_id, sep = " ")) |>
  filter(file_name != 31) |>
  dplyr::select(-time_id, -file_name)
# Optionally also drop mean_WAP:
# |>
#     select(-mean_WAP)

# Feature subsets tried so far, with notes on how well they separated:
# df_cl <- df_cl[c("name", "mean_BAS", "imbalance", "volume", "range", "num_order")]
# df_cl <- df_cl[c("name", "mean_BAS", "volume")] # Poor
# df_cl <- df_cl[c("name", "mean_BAS", "imbalance")] # Poor
# df_cl <- df_cl[c("name", "mean_BAS", "range")] # Very linear and not differentiated
# df_cl <- df_cl[c("name", "mean_BAS", "num_order")] # Same problem as mean_BAS + volume: L-shaped distribution
# df_cl <- df_cl[c("name", "imbalance", "mean_WAP")] # Much more interesting, but not quite the separation we need
# df_cl <- df_cl[c("name", "imbalance", "volume")] # Much more interesting, but not quite the separation we need
# df_cl <- df_cl[c("name", "imbalance", "range")] # Best so far, but still has significant grouping
# df_cl <- df_cl[c("name", "imbalance", "num_order")] # Very similar to imbalance + volume; not great
# df_cl <- df_cl[c("name", "volume", "range")] # L-shaped and very clumped
# df_cl <- df_cl[c("name", "volume", "num_order")] # Linear but clumped; very bad
# df_cl <- df_cl[c("name", "range", "num_order")]
# boxplot(df_cl$mean_WAP)
# boxplot(df_cl$imbalance)

# Distribution of the rescaled rush feature on a square-root scale.
boxplot(sqrt(df_cl$rush))

The two feature sets below (imbalance and mean_WAP, combined with either num_order or rush) gave the most differentiation between clusters:

#df_cl <- df_cl[c("name", "imbalance", "mean_WAP", "num_order")]
#df_cl <- df_cl[c("name", "imbalance", "mean_WAP", "rush")] 

Cluster with k = 4

# K-means (k = 4) on imbalance, mean_WAP and a log-sqrt-transformed rush.
k <- 4
rush_df_cl <- df_cl[c("name", "imbalance", "mean_WAP", "rush")]
# Compress the rush feature; the 0.001 offset keeps log() finite when the
# rescaled value is exactly 0.
rush_df_cl$rush <- log(sqrt(rush_df_cl$rush + 0.001))
# df_cl$mean_WAP = log(df_cl$mean_WAP)
# df_cl$imbalance = log(df_cl$imbalance)

# kmeans() picks random starting centres (20 restarts here); fixing the seed
# makes the cluster assignment and label numbering reproducible.
set.seed(1)
# Column 1 is `name`, so it is excluded from the clustering input.
km.out <- kmeans(rush_df_cl[-1], centers = k, nstart = 20)

# Plotting frame. Note that `rush` here is the rescaled value from df_cl, not
# the log-sqrt version that was passed to kmeans().
df <- data.frame(
  names = df_cl$name,
  imbalance = df_cl$imbalance,
  mean_WAP = df_cl$mean_WAP,
  rush = df_cl$rush,
  cluster = factor(km.out$cluster)
)

# Attach the cluster label to the full feature table. The rows of `df` are in
# the same order as df_cl, so the label is assigned positionally instead of
# being joined back on floating-point feature columns.
rush_df_cl <- df_cl
rush_df_cl$cluster <- df$cluster

# plot = ggplot(df, aes(x = volume, y = imbalance, z = wap, color = cluster, label = names)) + 
#   geom_point() + 
#   geom_text(aes(label=names), vjust = -1, hjust = 1) +
#   theme_minimal() +
#   labs(title = "Cluster Plot", x = "volume", y = "imbalance")
# 
# ggplotly(plot)

# Interactive 3-D scatter of the three features, coloured by cluster.
plot_ly(x = df$imbalance, y = df$mean_WAP, z = df$rush, type = "scatter3d", mode = "markers", marker = list(color = df$cluster))
# K-means (k = 4) on imbalance, mean_WAP and mean_BAS.
k <- 4
imbalance_df_cl <- df_cl[c("name", "imbalance", "mean_WAP", "mean_BAS")]

# kmeans() picks random starting centres (20 restarts here); fixing the seed
# makes the cluster assignment and label numbering reproducible.
set.seed(1)
# Column 1 is `name`, so it is excluded from the clustering input.
km.out <- kmeans(imbalance_df_cl[-1], centers = k, nstart = 20)

# Plotting frame: the three clustered features plus the assigned cluster.
df <- data.frame(
  names = df_cl$name,
  mean_BAS = df_cl$mean_BAS,
  mean_WAP = df_cl$mean_WAP,
  imbalance = df_cl$imbalance,
  cluster = factor(km.out$cluster)
)

# Attach the cluster label to the full feature table. The rows of `df` are in
# the same order as df_cl, so the label is assigned positionally instead of
# being joined back on floating-point feature columns.
imbalance_df_cl <- df_cl
imbalance_df_cl$cluster <- df$cluster

# plot = ggplot(df, aes(x = volume, y = imbalance, z = wap, color = cluster, label = names)) + 
#   geom_point() + 
#   geom_text(aes(label=names), vjust = -1, hjust = 1) +
#   theme_minimal() +
#   labs(title = "Cluster Plot", x = "volume", y = "imbalance")
# 
# ggplotly(plot)

# Interactive 3-D scatter of the three features, coloured by cluster.
plot_ly(x = df$imbalance, y = df$mean_WAP, z = df$mean_BAS, type = "scatter3d", mode = "markers", marker = list(color = df$cluster))
# PCA of every feature column in imbalance_df_cl, plotted on the first two
# components and coloured by the imbalance-based k-means cluster.
# dplyr::select is namespaced, as in the earlier df_cl pipeline, so the call
# does not depend on which attached package's select() wins.
df_imb_pca <- dplyr::select(imbalance_df_cl, -name, -cluster)

# Two fits: centred only, and centred + scaled to unit variance. Each
# variable is then overwritten with the data frame of component scores.
df_imb_pca_res <- prcomp(df_imb_pca, center = TRUE)
df_imb_pca_res_stand <- prcomp(df_imb_pca, center = TRUE, scale. = TRUE)
df_imb_pca_res <- as.data.frame(df_imb_pca_res$x)
df_imb_pca_res_stand <- as.data.frame(df_imb_pca_res_stand$x)

# The cluster column is bound into the plotted data so aes() refers to a
# column of that data instead of reaching into another data frame with `$`.
ggplot(
  cbind(df_imb_pca_res, cluster = imbalance_df_cl$cluster),
  aes(x = PC1, y = PC2, color = cluster)
) +
  geom_point(alpha = 0.5) +
  theme_bw() +
  labs(title = "PCA Plot of imbalance cluster (Not standardized)",
       x = "Principal Component 1",
       y = "Principal Component 2",
       color = "Cluster")

ggplot(
  cbind(df_imb_pca_res_stand, cluster = imbalance_df_cl$cluster),
  aes(x = PC1, y = PC2, color = cluster)
) +
  geom_point(alpha = 0.5) +
  theme_bw() +
  labs(title = "PCA Plot of imbalance cluster (Standardized)",
       x = "Principal Component 1",
       y = "Principal Component 2",
       color = "Cluster")

# PCA of every feature column in rush_df_cl, plotted on the first two
# components and coloured by the rush-based k-means cluster.
# dplyr::select is namespaced, as in the earlier df_cl pipeline, so the call
# does not depend on which attached package's select() wins.
df_rush_pca <- dplyr::select(rush_df_cl, -name, -cluster)

# Two fits: centred only, and centred + scaled to unit variance. Each
# variable is then overwritten with the data frame of component scores.
df_rush_pca_res <- prcomp(df_rush_pca, center = TRUE)
df_rush_pca_res_stand <- prcomp(df_rush_pca, center = TRUE, scale. = TRUE)
df_rush_pca_res <- as.data.frame(df_rush_pca_res$x)
df_rush_pca_res_stand <- as.data.frame(df_rush_pca_res_stand$x)

# The cluster column is bound into the plotted data so aes() refers to a
# column of that data instead of reaching into another data frame with `$`.
ggplot(
  cbind(df_rush_pca_res, cluster = rush_df_cl$cluster),
  aes(x = PC1, y = PC2, color = cluster)
) +
  geom_point(alpha = 0.5) +
  theme_bw() +
  labs(title = "PCA Plot of rush cluster (Not standardized)",
       x = "Principal Component 1",
       y = "Principal Component 2",
       color = "Cluster")

ggplot(
  cbind(df_rush_pca_res_stand, cluster = rush_df_cl$cluster),
  aes(x = PC1, y = PC2, color = cluster)
) +
  geom_point(alpha = 0.5) +
  theme_bw() +
  labs(title = "PCA Plot of rush cluster (Standardized)",
       x = "Principal Component 1",
       y = "Principal Component 2",
       color = "Cluster")